Note: Human part of the dataset consists of autistic samples only. Control samples were taken from GSE51264
Reads measured in RPKM
All samples belong to the PFC
Metadata downloaded using getGEO, expression data downloaded manually from the Supplementary file GSE59288_exp_mRNA.txt.gz and Probe information extracted using biomaRt
# LOAD METADATA
if(!file.exists('./../Data/Voineagu/GSE59288_exp_mRNA.txt.gz')){
GSE59288 = getGEO('GSE59288', destdir='./../Data/Liu/')
} else {
GSE59288 = getGEO(filename='./../Data/Liu/GSE59288_exp_mRNA.txt.gz')
}
# GEO entry has two datasets, keeping the human one
GPL11154 = GSE59288$`GSE59288-GPL11154_series_matrix.txt.gz`
datMeta = pData(GPL11154)
datMeta$ID = rownames(datMeta)
datMeta$age = round(as.numeric(gsub(' days', '', datMeta$`age:ch1`))/365)
datMeta$age_group = cut(datMeta$age, c(-0.5, 0, 0.6, 10, 20, 30, 40, 50, 60, 70, 80),
labels=c('Fetal','Infant','Child','10-20','20s','30s','40s','50s','60s','70s'))
# LOAD EXPRESSION DATA
gz_file = gzfile('./../Data/Liu/GSE59288_exp_mRNA.txt.gz','rt')
datExpr = read.delim(gz_file) %>% dplyr::select(aut1:aut34)
colnames(datExpr) = rownames(datMeta)
# ANNOTATE PROBES
getinfo = c('ensembl_gene_id','external_gene_id','chromosome_name','start_position',
'end_position','strand','band','gene_biotype','percentage_gc_content')
mart = useMart(biomart='ENSEMBL_MART_ENSEMBL',
dataset='hsapiens_gene_ensembl',
host='feb2014.archive.ensembl.org') ## Gencode v19
datProbes = getBM(attributes=getinfo, filters=c('ensembl_gene_id'), values=rownames(datExpr), mart=mart)
datProbes = datProbes[match(rownames(datExpr), datProbes$ensembl_gene_id),]
datProbes$length = datProbes$end_position-datProbes$start_position
# SFARI Genes
SFARI_genes = read_csv('./../Data/SFARI/SFARI_genes_with_ensembl_IDs.csv')
# GO Annotations
GO_annotations = read.csv('./../Data/GO_annotations/genes_GO_annotations.csv')
GO_neuronal = GO_annotations %>% filter(grepl('neuron', go_term)) %>%
mutate('ID' = as.character(ensembl_gene_id)) %>%
dplyr::select(-ensembl_gene_id) %>% distinct(ID) %>%
mutate('Neuronal' = 1)
# Combine SFARI and GO information
gene_info = data.frame('ID'=rownames(datExpr)) %>% left_join(SFARI_genes, by='ID') %>%
mutate(`gene-score`=ifelse(is.na(`gene-score`), 'None', `gene-score`)) %>%
left_join(GO_neuronal, by='ID') %>%
mutate('gene.score'=ifelse(`gene-score`=='None' & Neuronal==1, 'Neuronal', `gene-score`)) %>%
mutate('gene.score'=ifelse(is.na(gene.score), 'None', gene.score))
rm(GSE59288, GPL11154, gz_file, getinfo, mart)
Sex distribution: There are twice as many males than females
table(datMeta$`gender:ch1`)
##
## female male
## 10 24
Age distribution: Subjects between 2 and 60 years old with a mean close to 21
summary(datMeta$age)
## Min. 1st Qu. Median Mean 3rd Qu. Max.
## 2.00 9.00 18.50 21.24 29.75 60.00
1. Filter probes with start or end position missing
to_keep = !is.na(datProbes$length)
datProbes = datProbes[to_keep,]
datExpr = datExpr[to_keep,]
rownames(datProbes) = datProbes$ensembl_gene_id
print(paste0('Removed ', sum(!to_keep), 'probe, ', sum(to_keep), ' remaining'))
## [1] "Removed 35probe, 12522 remaining"
2. Filter probes with low expression levels
The density distribution doesn’t seem to suggest any clearly defined threshold. Selecting 1.2, which is the point where the slope changes at the beginning of the distribution
plot_data = data.frame('id'=rownames(datExpr), 'mean_expression' = rowMeans(datExpr))
ggplotly(plot_data %>% ggplot(aes(x=mean_expression)) + geom_density(color='#0099cc', fill='#0099cc', alpha=0.3) +
geom_vline(xintercept=1.2, color='gray') + scale_x_log10() +
ggtitle('Probe Mean Expression distribution') + theme_minimal())
to_keep = rowMeans(datExpr)>1.2
datProbes = datProbes[to_keep,]
datExpr = datExpr[to_keep,]
print(paste0('Removed ', sum(!to_keep), ' probe, ', sum(to_keep), ' remaining'))
## [1] "Removed 127 probe, 12395 remaining"
3. Filter outlier samples
Using node connectivity as a distance measure, normalising it and filtering out genes farther away than 2 standard deviations from the left (lower connectivity than average, not higher)
Using \(s_{ij}=|bw(i,j)|\) to define connectivity between probes.
Filtering a single data point
absadj = datExpr %>% bicor %>% abs
netsummary = fundamentalNetworkConcepts(absadj)
ku = netsummary$Connectivity
z.ku = (ku-mean(ku))/sqrt(var(ku))
plot_data = data.frame('ID'=rownames(datMeta), 'sample'=1:length(z.ku), 'distance'=z.ku, 'Age'=datMeta$age_group)
ggplotly(plot_data %>% ggplot(aes(sample, distance, color=Age)) + geom_point() + geom_hline(yintercept=-2, color='gray') + theme_minimal())
print(paste0('Outlier samples: ', paste(as.character(plot_data$ID[plot_data$distance< -2]), collapse=', ')))
## [1] "Outlier samples: GSM1432827"
to_keep = abs(z.ku)<2
datMeta = datMeta[to_keep,]
datExpr = datExpr[,to_keep]
print(paste0('Removed ', sum(!to_keep), ' sample, ', sum(to_keep), ' remaining'))
## [1] "Removed 1 sample, 33 remaining"
rm(absadj, netsummary, ku, z.ku, plot_data, to_keep)
print(paste0('After filtering, the dataset consists of ', nrow(datExpr), ' probes and ', ncol(datExpr), ' samples'))
## [1] "After filtering, the dataset consists of 12395 probes and 33 samples"
No clear relation between SFARI score and mean expression/SD. This dataset doesn’t seem to have the same problem as Gandal’s
plot_data = data.frame('ID'=rownames(datExpr), 'MeanExpr'=rowMeans(datExpr), 'SDExpr'=apply(datExpr,1,sd)) %>%
left_join(gene_info, by='ID')
p1 = ggplotly(plot_data %>% ggplot(aes(gene.score, MeanExpr, fill=gene.score)) + geom_boxplot() +
scale_fill_manual(values=SFARI_colour_hue(r=c(1:6,8,7))) + theme_minimal() +
theme(legend.position='none'))
p2 = ggplotly(plot_data %>% ggplot(aes(gene.score, SDExpr, fill=gene.score)) + geom_boxplot() +
scale_fill_manual(values=SFARI_colour_hue(r=c(1:6,8,7))) + theme_minimal() +
ggtitle('Mean Expression (left) and SD (right) by SFARI score') +
theme(legend.position='none'))
subplot(p1, p2, nrows=1)
rm(plot_data, p1, p2)
plot_data = data.frame('ID'=rownames(datExpr), 'Mean'=rowMeans(datExpr), 'SD'=apply(datExpr,1,sd))
plot_data %>% ggplot(aes(Mean, SD)) + geom_point(color='#0099cc', alpha=0.1) + geom_abline(color='gray') +
scale_x_log10() + scale_y_log10() + theme_minimal()
rm(plot_data)
We cannot use DESeq2 normalisation because the entries are not counts, but RPKM, so using limma instead
dge = DGEList(counts=datExpr, samples=datMeta, genes=datProbes)
# Calculate Normalisation factors
dge = calcNormFactors(dge)
# Perform Normaliation
if(max(dge$samples$lib.size)/min(dge$samples$lib.size)>3){
print('Should use voom instead of cpm because the library size ratios are too big')
}
logCPM = cpm(dge, log=TRUE)
# Extract elements
datExpr = logCPM %>% data.frame
datMeta = dge$samples
datProbes = dge$genes
Data seems to be homoscedastic
plot_data = data.frame('ID'=rownames(datExpr), 'Mean'=rowMeans(datExpr), 'SD'=apply(datExpr,1,sd))
p1 = meanSdPlot(as.matrix(datExpr), plot=FALSE)$gg + theme_minimal()
p2 = plot_data %>% ggplot(aes(Mean, SD)) + geom_point(color='#0099cc', alpha=0.1) +
geom_smooth(method='lm', color='gray', size=0.5) + theme_minimal()
grid.arrange(p1, p2, ncol=2)
rm(plot_data, p1, p2)
No Batch information available in the metadata!
PCA: There doesn’t seem to be a specific pattern related to gender or age
*There weren’t any patterns in MDS and t-SNE either (MDS was the same as PCA)
pca = datExpr %>% t %>% prcomp
plot_data = data.frame('ID'=colnames(datExpr), 'PC1' = pca$x[,1], 'PC2' = pca$x[,2]) %>%
left_join(datMeta, by='ID') %>% dplyr::select('PC1','PC2','age','age_group','gender.ch1')
p1 = plot_data %>% ggplot(aes(PC1, PC2, color=gender.ch1)) + geom_point() + theme_minimal() + ggtitle('PCA') +
xlab(paste0('PC1 (',round(100*summary(pca)$importance[2,1],1),'%)')) +
ylab(paste0('PC2 (',round(100*summary(pca)$importance[2,2],1),'%)'))
p2 = plot_data %>% ggplot(aes(PC1, PC2, color=age)) + geom_point() + theme_minimal() + ggtitle('PCA') +
xlab(paste0('PC1 (',round(100*summary(pca)$importance[2,1],1),'%)')) + scale_color_viridis() +
ylab(paste0('PC2 (',round(100*summary(pca)$importance[2,2],1),'%)'))
grid.arrange(p1, p2, nrow=1)
rm(pca, plot_data)
First Principal Component explains 86% of the total variance (less than in Gandal and BrainSpan)
There’s a really strong correlation between the mean expression of a probe and the 1st principal component
pca = datExpr %>% prcomp
plot_data = data.frame( 'PC1' = pca$x[,1], 'PC2' = pca$x[,2], 'MeanExpr'=rowMeans(datExpr))
plot_data %>% ggplot(aes(PC1, PC2, color=MeanExpr)) + geom_point(alpha=0.5) + theme_minimal() +
scale_color_viridis() + ggtitle('PCA') +
xlab(paste0('PC1 (',round(100*summary(pca)$importance[2,1],1),'%)')) +
ylab(paste0('PC2 (',round(100*summary(pca)$importance[2,2],1),'%)'))
rm(pca, plot_data)
No clear relation between SFARI score and mean expression/SD. This dataset doesn’t seem to have the same problem as Gandal’s
plot_data = data.frame('ID'=rownames(datExpr), 'MeanExpr'=rowMeans(datExpr), 'SDExpr'=apply(datExpr,1,sd)) %>%
left_join(gene_info, by='ID')
p1 = ggplotly(plot_data %>% ggplot(aes(gene.score, MeanExpr, fill=gene.score)) + geom_boxplot() +
scale_fill_manual(values=SFARI_colour_hue(r=c(1:6,8,7))) + theme_minimal() +
theme(legend.position='none'))
p2 = ggplotly(plot_data %>% ggplot(aes(gene.score, SDExpr, fill=gene.score)) + geom_boxplot() +
scale_fill_manual(values=SFARI_colour_hue(r=c(1:6,8,7))) + theme_minimal() +
ggtitle('Mean Expression (left) and SD (right) by SFARI score') +
theme(legend.position='none'))
subplot(p1, p2, nrows=1)
rm(plot_data, p1, p2)